package com.luoxue.mito.crawler;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;
import com.luoxue.mito.config.MitoConfig;
import com.luoxue.mito.dao.CategoryRepository;
import com.luoxue.mito.dao.MitoPostRepository;
import com.luoxue.mito.entity.Category;
import com.luoxue.mito.entity.MitoPost;
import com.luoxue.mito.util.GsonUtil;
import com.luoxue.mito.util.ImageUtil;
import com.luoxue.mito.util.RegexUtil;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;

import javax.annotation.Resource;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;

@Slf4j
@Component
public class VmGirlsCrawler extends BreadthCrawler {
    private static VmGirlsCrawlType vmGirlsCrawlType = VmGirlsCrawlType.start;
    private final static String url = "https://www.vmgirls.com";

    @Resource
    MitoConfig mitoConfig;
    @Resource
    private MitoPostRepository mitoPostRepository;
    @Resource
    private CategoryRepository categoryRepository;


    private final Map<String, Category> categoryMap = new ConcurrentHashMap<>();

    public VmGirlsCrawler() {
        super("crawl", false);

        /*start pages*/
        this.addSeed(String.format("%s/wp-sitemap-posts-post-1.xml", url));

        setThreads(10);
//        getConf().setTopN(30);

        //enable resumable mode
//        setResumable(true);
    }

    public synchronized static VmGirlsCrawlType getVmGirlsCrawlType() {
        return vmGirlsCrawlType;
    }

    public synchronized static void setVmGirlsCrawlType(VmGirlsCrawlType vmGirlsCrawlType) {
        VmGirlsCrawler.vmGirlsCrawlType = vmGirlsCrawlType;
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        Elements elements;
        if (page.matchUrl(String.format("%s/wp-sitemap-posts-post-1.xml", url))) {
            elements = page.select("loc");
            elements.forEach(element -> {
                String text = element.text();
                next.add(text);
            });
//            next.add(elements.get(0).text());
        } else if (page.matchUrl(String.format("%s/\\d+.html", url))) {
            String html = page.html();

            String category = page.selectText(".text-xx a[rel='category tag']");
            String title = RegexUtil.matcherText("title: '(.*?)'", html);
            String summary = RegexUtil.matcherText("summary: '([\\s\\S]*?)'", html);
            String pic = RegexUtil.matcherText("pic: '(.*?)'", html);

            Integer pageViews = Integer.valueOf(RegexUtil.matcherText("<span>阅读\\s+?([\\d,]+)\\s+?</span>", html).replaceAll(",", ""));
            Integer dataFlag = RegexUtil.matcherInt(String.format("%s/(\\d+).html", url), page.url());
            List<String> images = page.select(".post-content img").stream().map(element -> element.attr("src")).collect(Collectors.toList());
            List<String> tags = page.select("a[rel=\"tag\"]").stream().map(Element::text).collect(Collectors.toList());
            MitoPost mitoPost = new MitoPost(page.url(), title, category, summary, pic, pageViews, dataFlag, images, tags);

            if (mitoConfig.getEnableLocalUpload()) {
                try {
                    mitoPost.setServerPic(picDownload(pic));
                    List<String> stringList = images.stream().map(this::picDownload).filter(Objects::nonNull).collect(Collectors.toList());
                    if(!CollectionUtils.isEmpty(stringList)){
                        mitoPost.setServerImages(GsonUtil.GsonString(stringList));
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            tags.forEach(tag -> {
                String key = String.format("%s_%s", category, tag);
                if (!categoryMap.containsKey(key)) {
                    Category categoryBean = new Category(null, category, tag);
                    categoryMap.put(key, categoryBean);
                    categoryRepository.saveAndFlush(categoryBean);
                }
            });

            mitoPostRepository.saveAndFlush(mitoPost);
        }
    }

    private String picDownload(String url) {
        try {
            URL u = new URL(url);
            if (!url.equals(u.getPath())) {
//                FileUtil.downloadFileToServer(url, mitoConfig.getFilePath() + u.getPath());
                ImageUtil.saveFile(u, mitoConfig.getFilePath() + u.getPath());
                return u.getPath();
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        return null;
    }


    @Override
    public void afterStop() {
        super.afterStop();
        categoryMap.clear();
        setVmGirlsCrawlType(VmGirlsCrawlType.done);
    }

}
